library("quanteda")
library("topicmodels")
library("readtext")
library(readr)
library(tidytext)
library(dplyr)
library(ggplot2)
library(topicdoc)
library(widyr)
library(tm)
library(tmap)

# Load the raw texts: a CSV file with a single column, one text per row.
# read_csv() needs a file argument, so the path is read from the D_DATA_CSV
# environment variable; in an interactive session without that variable a
# file chooser is opened instead.
# NOTE(review): the original call had no arguments at all - add
# `col_names = FALSE` here if the file has no header row.
d_data_path <- Sys.getenv("D_DATA_CSV")
if (!nzchar(d_data_path)) {
  if (!interactive()) {
    stop(
      "Set the D_DATA_CSV environment variable to the input CSV file.",
      call. = FALSE
    )
  }
  d_data_path <- file.choose()
}
d_data <- read_csv(d_data_path)

# corpus() needs the documents in a variable called "text". readr calls an
# unnamed column "X1" (readr 1.x) or "...1" (readr 2.x), so fall back to the
# first column when "X1" is absent, then drop the original column.
text_column <- if ("X1" %in% names(d_data)) "X1" else names(d_data)[1]
if (text_column != "text") {
  d_data$text <- d_data[[text_column]]
  d_data[[text_column]] <- NULL
}

d_corpus <- corpus(d_data) # build the corpus from the "text" column
d_corpus <- tolower(d_corpus)


d_tokens <- tokens(d_corpus) # split the whole corpus into separate word tokens

# Extend the stopword list to fit this data set: emoji, tweet artefacts
# (links, mentions, hashtags, "rt", "amp"), time expressions and other
# low-information words, followed by the standard English stopwords.
# Entries containing "*" are patterns rather than literal words.
emoji_stopwords <- c(
  "👏", "👇", "🔵", "🌹", "✅", "🇬🇧", "💙", "✔", "👉", "🚨", "❌",
  "🇬🇧", "🌳", "🗳", "👍", "🗣", "📢", "📆", "⬇", "⤵"
)
word_stopwords <- c(
  "time", "go", "tonight", "morning", "minut*", "hour*", "amp", "http*",
  "@*", "#*", "even", "will", "monday", "tuesday", "wednesday", "thursday",
  "friday", "saturday", "sunday", "extra", "use", "rail", "today",
  "yesterday", "tomorrow", "now", "full", "bring", "big", "say", "meet",
  "across", "add", "step", "s", "t", "one", "two", "three", "four", "five",
  "six", "seven", "eight", "nine", "ten", "summer", "winter", "spring",
  "autumn", "ever", "pm", "am", "rt", "back", "like", "still", "see", "end",
  "first", "many", "come", "yet", "real", "find", "sure", "never",
  "january", "february", "march", "april", "may", "june", "july", "august",
  "september", "october", "november", "december", "day*", "week*",
  "month*", "always", "since", "become", "keep", "let", "can", "ahead",
  "new", "hard", "made", "ago", "just", "still"
)
new_stopwords <- c(emoji_stopwords, word_stopwords, stopwords("english"))
# Build the document-feature matrix: tokenise without punctuation and
# numbers, lower-case, drop the stopwords (glob patterns such as "http*"),
# then stem what is left.
# The steps are written out because calling dfm() on a corpus and its
# `remove` / `stem` arguments are deprecated since quanteda 3.0; this
# pipeline also runs on quanteda 2.x.
d_dfm <- d_corpus %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = new_stopwords) %>%
  tokens_wordstem() %>%
  dfm()

# LDA topic modelling
# topicmodels::LDA() stops when a document has no terms at all, which happens
# when stopword removal empties a short text. Such documents are dropped
# before converting to the DocumentTermMatrix format used by topicmodels.
d_dfm_lda <- dfm_subset(d_dfm, ntoken(d_dfm) > 0)
d_dtm <- convert(d_dfm_lda, to = "topicmodels")

# Settings for the Gibbs sampler passed to LDA() below. Gibbs sampling is a
# random walk over the distribution, so the seeds are fixed to keep results
# reproducible. The values themselves were found by trial and error.
seed <- as.list(c(42, 5, 24, 158, 2500)) # one seed per independent run
nstart <- 5       # number of independent runs; must match length(seed)
burnin <- 2000    # iterations discarded at the start, before the chain settles
iter <- 1000      # Gibbs iterations performed after the burn-in
thin <- 200       # thinning interval applied within those iterations
best <- TRUE      # keep only the best of the nstart runs

# Fit the LDA model. The number of topics (k) is fixed at 5 here; choose it
# first, then inspect the topics in the plots further down.
gibbs_control <- list(
  burnin = burnin,
  iter = iter,
  thin = thin,
  seed = seed,
  nstart = nstart,
  best = best
)
d_lda <- LDA(d_dtm, k = 5, method = "Gibbs", control = gibbs_control)

# Perplexity of the fitted model on the same document-term matrix.
# Open question from the author: is this a valid figure for choosing k?
# (see attached figures)
perplexity(d_lda, newdata = d_dtm, estimate_theta = FALSE)

# Tidy per-topic term probabilities: one row per (topic, term, beta).
d_lda_topics <- tidy(d_lda, matrix = "beta")

# Select the terms shown in the bar plot: the 20 terms with the highest beta
# in every topic.
# Group by topic so the selection is made per topic, then ungroup so the
# result is an ordinary data set again.
d_lda_topterms <- ungroup(top_n(group_by(d_lda_topics, topic), 20, beta))

# Sort topics in increasing order and, within each topic, beta in decreasing
# order.
d_lda_topterms <- arrange(d_lda_topterms, topic, desc(beta))

# Bar plot of the top terms, one panel per topic.
# A plain reorder(term, beta) sorts the terms once for the whole data set, so
# a term that occurs in several topics ends up out of order inside its panel.
# reorder_within() sorts the terms separately within each topic, and
# scale_x_reordered() removes the suffix it appends to the axis labels.
d_lda_topterms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  # terms on the x axis, beta on the y axis, bars coloured by topic
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) + # the legend adds nothing next to the facets
  # free scales: beta ranges and term sets differ from topic to topic
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +
  coord_flip() # swap the axes so the terms are listed top to bottom


# Topic coherence of every topic, based on its top 10 tokens, plus a simple
# plot. The scores have to be stored first: plot(topic_coherence) passes the
# function itself to plot(), which was the cause of the error.
d_lda_coherence <- topic_coherence(
  d_lda, d_dtm,
  top_n_tokens = 10,
  smoothing_beta = 1
)

plot(
  d_lda_coherence,
  type = "b",
  xlab = "Topic",
  ylab = "Topic coherence"
)


# Cosine similarity between every pair of topics, using widyr's
# pairwise_similarity(tbl, item, feature, value).
# It needs a tidy table, not the fitted model: the items compared are the
# topics, the features are the terms and the values are the beta
# probabilities. The original call passed the LDA object and the columns
# `word` / `n`, which do not exist, hence the error.
# The full topic-term table is used so that every topic is described over the
# same vocabulary.
comparisons <- d_lda_topics %>%
  pairwise_similarity(topic, term, beta) %>%
  arrange(desc(similarity))


# Document-to-document similarity (correlation). Only meaningful when the
# texts being compared are stored as separate rows/documents.
# NOTE(review): from quanteda 3.0 on, textstat_simil() is provided by the
# quanteda.textstats package - confirm which quanteda version is installed.
simil <- textstat_simil(
  x = d_dfm,
  y = d_dfm,
  method = "correlation",
  margin = "documents"
)
simil

# Word associations: cosine similarity of every feature with "virus",
# showing the first 18 entries.
sim <- textstat_simil(
  x = d_dfm,
  y = d_dfm[, "virus"],
  margin = "features",
  method = "cosine"
)
lapply(as.list(sim), function(scores) head(scores, 18))

# Expected topic proportions from a structural topic model.
# Keep only terms that occur at least 10 times overall and in at most 50
# documents.
quant_dfm <- dfm_trim(d_dfm, min_termfreq = 10, max_docfreq = 50)

set.seed(1111)
# stm is optional: the model is fitted and plotted only when the package can
# be loaded.
stm_loaded <- require(stm)
if (stm_loaded) {
  my_lda_fit20 <- stm(documents = quant_dfm, K = 5, verbose = TRUE)
  plot(my_lda_fit20)
}

